In [3]:
import pandas as pd
import numpy as np
import os
import glob
import nltk.data
import nltk, re, pprint
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import Counter
import sqlite3


def connect_db():
    return sqlite3.connect('/Users/sheldon/podcasts/test.db')

def create_df_object():
    # pull every transcribed episode out of SQLite into a DataFrame
    conn = connect_db()
    df = pd.read_sql("select * from podcast", conn)
    return df

df = create_df_object()

# NLTK's English stop word list, as a set for O(1) membership tests
stop = set(stopwords.words('english'))

In [6]:
# the same episodes also live in Postgres; read them through SQLAlchemy
import psycopg2
import sys
from sqlalchemy import create_engine

engine = create_engine('postgresql://sheldon@localhost:5432/sheldon')
df1 = pd.read_sql("select * from podcasts", engine)

In [7]:
# DataFrame.query() parses its argument as a boolean expression,
# not as SQL, so "select *" is rejected outright:
df1.query("select *")


  File "<unknown>", line 1
    select *
            ^
SyntaxError: invalid syntax
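A working `query()` call filters rows with a boolean expression instead; a sketch, where the `series` column name is taken from the merges later in this notebook and the value is a placeholder:

In [ ]:
# filter with a boolean expression; full SQL belongs in pd.read_sql
df1.query("series == 'somepodcast'")  # 'somepodcast' is a hypothetical value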

In [3]:
def remove_stop_words(row):
    # tokenize the transcript, then drop stop words and contraction fragments
    tokens = word_tokenize(str(row))
    tokens = [w for w in tokens if w not in stop]
    tokens = [word for word in tokens if "'" not in word]
    return ' '.join(tokens)

df['transcribed'] = df['transcribed'].apply(remove_stop_words)
texts = df.transcribed.tolist()

# count how often each word appears across all transcripts
# (split each text into words; iterating the string itself would count characters)
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text.split():
        frequency[token] += 1
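The same counts can be built in one pass with `collections.Counter`, which is already imported above; a minimal equivalent sketch:

In [ ]:
from collections import Counter

# word frequencies across every cleaned transcript
frequency = Counter(token for text in texts for token in text.split())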

In [4]:
from gensim import corpora, models, similarities
import gensim

In [4]:
'''class MyCorpus(object):
    def __iter__(self):
        for doc in docs:
            yield dictionary.doc2bow(doc.split())
corpus_mem_friendly = MyCorpus()
corpora.MmCorpus.serialize('corpus.mm',corpus_mem_friendly)
dictionary.save('words.dict')
df["review_text"] = df["transcribed"].map(lambda x: x.split(' '))
from gensim import corpora
dictionary = corpora.Dictionary(df["review_text"])
'''


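The commented-out cell above uses `dictionary` and `docs` before they exist. A runnable ordering of the same steps might look like this (a sketch, assuming `df['transcribed']` holds the cleaned transcripts):

In [ ]:
from gensim import corpora

# tokenize once, build the id<->word mapping, then stream bag-of-words
# vectors to disk so the full corpus never has to sit in memory
df["review_text"] = df["transcribed"].map(lambda x: x.split(' '))
dictionary = corpora.Dictionary(df["review_text"])
dictionary.save('words.dict')

class MyCorpus(object):
    def __iter__(self):
        for doc in df["review_text"]:
            yield dictionary.doc2bow(doc)

corpora.MmCorpus.serialize('corpus.mm', MyCorpus())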

In [9]:
# load all the previously saved models
dictionary = corpora.Dictionary.load('models/words.dict')
corpus = corpora.MmCorpus.load('models/corpus.mm')
tfidf = gensim.models.tfidfmodel.TfidfModel.load('models/tfidf_model')
lsi = gensim.models.lsimodel.LsiModel.load('models/model.lsi')
index = similarities.MatrixSimilarity.load('models/corpus.index')
#tfidf.save('tfidf_model')
#lsi.save('models/model.lsi')
#tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
#lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=75)
corpus_lsi = lsi[corpus_tfidf]


---------------------------------------------------------------------------
UnpicklingError                           Traceback (most recent call last)
<ipython-input-9-2343d514bd53> in <module>()
      1 #load all the stuff
      2 dictionary = corpora.Dictionary.load('models/words.dict')
----> 3 corpus = corpora.MmCorpus.load('models/corpus.mm')
      4 tfidf = gensim.models.tfidfmodel.TfidfModel.load('models/tfidf_model')
      5 lsi = gensim.models.lsimodel.LsiModel.load('models/model.lsi')

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
    246         compress, subname = SaveLoad._adapt_by_suffix(fname)
    247 
--> 248         obj = unpickle(fname)
    249         obj._load_specials(fname, mmap, compress, subname)
    250         return obj

/Users/sheldon/anaconda/envs/capstone/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
    909     with smart_open(fname) as f:
    910         # Because of loading from S3 load can't be used (missing readline in smart_open)
--> 911         return _pickle.loads(f.read())
    912 
    913 

UnpicklingError: invalid load key, '%'.
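The UnpicklingError comes from line 3: gensim's `.load()` unpickles, but `corpus.mm` is a plain-text Matrix Market file whose header starts with `%`. The `MmCorpus` constructor reads that format directly:

In [ ]:
# .load() is only for objects written with gensim's .save() (pickled);
# Matrix Market files go through the constructor instead
corpus = corpora.MmCorpus('models/corpus.mm')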


In [11]:
def getKey(item):
    return item[1]

# the 100 strongest words for topic 8, sorted by LSI weight
sorted(lsi.show_topic(8, topn=100), key=getKey, reverse=True)


Out[11]:
[(u'economics', 0.20075241030479216),
 (u'economists', 0.096918993194952341),
 (u'gender', 0.086645740787323997),
 (u'offender', 0.080986352314974794),
 (u'coin', 0.068897077342306143),
 (u'offenders', 0.068654664387510944),
 (u'marijuana', 0.068175380206256297),
 (u'apron', 0.066312516353626047),
 (u'financial', 0.062462980433143037),
 (u'education', 0.061449591429715587),
 (u'police', 0.061018826813492695),
 (u'petty', 0.059712896718308241),
 (u'preferences', 0.056998621035908949),
 (u'assaulting', 0.055874036704667569),
 (u'charities', 0.054803188035918111),
 (u'kidney', 0.0544317951725444),
 (u'officer', 0.052501476182249304),
 (u'caleb', 0.052433083247781884),
 (u'diploma', 0.050797805357103633),
 (u'registry', 0.050217514569610322),
 (u'giants', 0.050112168659068236),
 (u'driver', 0.048065446698679702),
 (u'reaganomics', 0.046414306760494413),
 (u'game', 0.045799533122943097),
 (u'economist', 0.045648828501652632),
 (u'crime', 0.044992693069287437),
 (u'diplomas', 0.044448079687465711),
 (u'radio', 0.044157869116420258),
 (u'currency', 0.043632198572000444),
 (u'cowboys', 0.043251296345787796),
 (u'utopia', 0.042731704755242149),
 (u'tommy', 0.042726015553238447),
 (u'steve', 0.042248189766779923),
 (u'alcohol', 0.042056066901713353),
 (u'games', 0.041866191450996126),
 (u'trophy', 0.041702718345644214),
 (u'carolina', 0.041576761668042375),
 (u'preference', 0.041406695996573532),
 (u'packers', 0.040655820422947908),
 (u'denver', 0.040584694736775735),
 (u'freak', 0.040476103140352888),
 (u'trophies', 0.0389693134498569),
 (u'thump', 0.038741618715308276),
 (u'bono', 0.038338917796152283),
 (u'firstborn', 0.037978874195838147),
 (u'hargreaves', 0.03764094449513268),
 (u'gift', 0.037495529218878103),
 (u'restrooms', 0.037270876644085002),
 (u'mortified', 0.036611151467359319),
 (u'pittsburgh', 0.036132237090812522),
 (u'chicago', 0.036118557649630158),
 (u'mortgages', 0.03608682634714637),
 (u'pornography', 0.036065406294734731),
 (u'morgan', 0.035849356936325023),
 (u'blog', 0.035848506091984936),
 (u'sex', 0.035681433053336195),
 (u'sister', 0.035546112638106628),
 (u'crimes', 0.035543037065394176),
 (u'minus', 0.035494555582051744),
 (u'tribune', 0.034926183742266491),
 (u'criminal', 0.034920818902150597),
 (u'charges', 0.03489386369348807),
 (u'markets', 0.034731668978308303),
 (u'jets', 0.033874664078435396),
 (u'degree', 0.033866707790161196),
 (u'dean', 0.033815812714591047),
 (u'stanley', 0.03370301132013459),
 (u'minnesota', 0.03333750931794318),
 (u'paddy', 0.033144301705865811),
 (u'sesame', 0.032815701742337736),
 (u'zoom', 0.032460020806240768),
 (u'embarking', -0.032504793891144711),
 (u'donald', -0.033636813452894584),
 (u'improving', -0.033735014880266589),
 (u'shane', -0.034054833765613128),
 (u'ladies', -0.03477144701118861),
 (u'genetic', -0.035370858950304208),
 (u'patient', -0.035657589553726811),
 (u'medication', -0.036161178193132575),
 (u'hype', -0.03704937890596121),
 (u'asia', -0.037416770020017948),
 (u'ha', -0.037651017605805283),
 (u'polls', -0.037791308688583568),
 (u'franklin', -0.037857829243672328),
 (u'diabetes', -0.04025621090744548),
 (u'black', -0.040731944805092277),
 (u'republican', -0.040976605810292319),
 (u'trump', -0.041859603230542157),
 (u'queen', -0.043730798342776406),
 (u'shame', -0.044976328399778727),
 (u'hap', -0.045146926700645996),
 (u'movies', -0.051521539418352212),
 (u'cream', -0.051756938493142189),
 (u'swiss', -0.056873965391394714),
 (u'philadelphia', -0.062535190303895261),
 (u'movie', -0.065527781760655185),
 (u'health', -0.065892805668891316),
 (u'patients', -0.097967991649763622),
 (u'vulnerability', -0.10969328881746618),
 (u'queens', -0.11206711629168782)]
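gensim can also render a topic as a single weighted-term string, which is easier to scan than the raw tuples:

In [ ]:
# compact summary of topic 8, the topic inspected above
print(lsi.print_topic(8, topn=10))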

In [246]:
top_topics


Out[246]:
[(8, 0.17896380730880518),
 (21, 0.091766397149891432),
 (11, 0.065616352648861648),
 (13, 0.064971534129887182),
 (18, 0.025042972421945916),
 (14, 0.024075613086501517),
 (6, 0.015937881772443523),
 (3, 0.01362197571894958),
 (5, 0.013074554462267355),
 (4, 0.0088382479607463076),
 (9, 0.0022015115732176059),
 (19, -0.027478503662382983),
 (1, -0.033732093283322856),
 (2, -0.03667141346621143),
 (17, -0.043455623803401247),
 (7, -0.043792188330803443),
 (15, -0.045263199178342921),
 (16, -0.045927792542153609),
 (12, -0.07568664639746138),
 (22, -0.081326541458310503),
 (24, -0.10229900073505233),
 (0, -0.11914869474184712),
 (10, -0.13419026138406509),
 (23, -0.20864550487239134),
 (20, -0.23951662981995908)]

In [216]:
corpus_lsi[1]


Out[216]:
[(0, -0.30666008595379457),
 (1, -0.11729049248029232),
 (2, 0.09465178780371751),
 (3, 0.1021867895086989),
 (4, -0.055071455362356046),
 (5, 0.042427461451872366),
 (6, -0.099190452301410284),
 (7, -0.15977382403279178),
 (8, -0.19050174225760189),
 (9, 0.11962812051264791),
 (10, -0.062988823786490372),
 (11, 0.15656530181930398),
 (12, -0.088286265017136933),
 (13, -0.068219332021130799),
 (14, 0.074681883889618314),
 (15, -0.010344141539310351),
 (16, -0.011351173648180354),
 (17, 0.045443599720397507),
 (18, 0.011356358861277709),
 (19, 0.020344898048179155),
 (20, 0.016910421609358444),
 (21, -0.007355589844098882),
 (22, -0.032113079486917967),
 (23, -0.047263684081847689),
 (24, 0.0025554615751745072)]

In [9]:
def get_related_podcasts(index):
    def getKey(item):
        return item[1]
    # ten strongest LSI topics for this podcast
    corpus = corpus_lsi[index]
    corpus = sorted(corpus, key=getKey, reverse=True)[:10]
    related_df = pd.DataFrame(corpus, columns=['index', 'score'])
    final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
    return final_df

related_podcasts = list(get_related_podcasts(1)['index'])

def get_topics_per_podcast(podcast_index):
    # keep only topics this podcast loads on with weight > 0.10
    topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
    def get_topic_arrays(topic_ids):
        x = []
        for id in topic_ids:
            # the five strongest words per topic, keeping weights above .05
            list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
            z = []
            for word in list_of_words:
                if word[1] > .05:
                    z.append(word)
            x.append(z)
        return x
    return get_topic_arrays(topic_ids)

testing = [[idx, get_topics_per_podcast(idx)] for idx in related_podcasts]
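The `getKey` helper re-implements `operator.itemgetter`; the standard-library version avoids redefining the same one-liner in every cell:

In [ ]:
from operator import itemgetter

# ten strongest (topic_id, weight) pairs for podcast 1, highest first
sorted(corpus_lsi[1], key=itemgetter(1), reverse=True)[:10]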

In [12]:
x = pd.DataFrame(testing, columns=['index', 'words'])
x.words.iloc[0]


Out[12]:
[[(u'tommy', 0.13349055465001164),
  (u'petrified', 0.13210484035640158),
  (u'elkins', 0.12485860653687339),
  (u'trump', 0.11205463919933509)],
 [(u'lakeview', 0.16230038683391426), (u'chandler', 0.13397388194476367)],
 [(u'dean', 0.15605057080987761), (u'police', 0.097709114659441917)],
 [(u'movie', 0.19523593887426033),
  (u'movies', 0.13411589375868901),
  (u'assaulting', 0.092371566597563279)]]

In [150]:
def get_related_podcasts(query):
    def getKey(item):
        return item[1]
    # project the query into LSI space and rank every episode against it
    vec_box = dictionary.doc2bow(query.split())
    vec_lsi = lsi[vec_box]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]
    related_df = pd.DataFrame(sims, columns=['index', 'score'])

    def get_related_podcasts_list(index):
        corpus = corpus_lsi[index]
        corpus = sorted(corpus, key=getKey, reverse=True)[:10]
        related_df = pd.DataFrame(corpus, columns=['index', 'score'])
        final_df = pd.merge(related_df, df, on='index')[['index', 'episode', 'score', 'series']]
        return final_df

    # note: topic words are pulled for podcast 1 here, not for the query's top hit
    related_podcasts = list(get_related_podcasts_list(1)['index'])

    def get_topics_per_podcast(podcast_index):
        # keep only topics with weight > 0.10, then each topic's > .05 words
        topic_ids = [i for i in sorted(corpus_lsi[podcast_index], key=getKey, reverse=True) if i[1] > 0.10]
        def get_topic_arrays(topic_ids):
            x = []
            for id in topic_ids:
                list_of_words = sorted(lsi.show_topic(id[0], topn=5), key=getKey, reverse=True)
                z = []
                for word in list_of_words:
                    if word[1] > .05:
                        z.append(word)
                x.append(z)
            return x
        return get_topic_arrays(topic_ids)

    topics_per_podcast = [[idx, get_topics_per_podcast(idx)] for idx in related_podcasts]
    other_df = pd.DataFrame(topics_per_podcast, columns=['topic_index', 'words'])
    final_df = pd.merge(related_df, df)
    test_final_df = pd.merge(other_df, final_df, left_index=True, right_index=True)[['words', 'index', 'score', 'episode', 'series']]
    return test_final_df

In [161]:
x = get_related_podcasts('cats')
zz = x.words.iloc[0]

In [172]:
zz[1]


Out[172]:
[(u'lakeview', 0.16230038683391426), (u'chandler', 0.13397388194476367)]

In [151]:
test


Out[151]:
id rank
0 20 0.291095
1 29 0.225385
2 23 0.218698
3 30 0.197811
4 22 0.185274
5 17 0.174860
6 24 0.160806
7 34 0.159709
8 26 0.139874
9 28 0.128635
10 25 0.124274
11 21 0.119610
12 27 0.111168
13 32 0.108407
14 18 0.091393
15 31 0.054864
16 89 0.023582
17 33 0.022864
18 78 0.008958
19 19 -0.003617
20 62 -0.021017
21 119 -0.025039
22 74 -0.027536
23 121 -0.031408
24 137 -0.037055
25 68 -0.037749
26 115 -0.037889
27 122 -0.039599
28 99 -0.042506
29 65 -0.048880
... ... ...
121 47 -0.194450
122 0 -0.195210
123 3 -0.197204
124 107 -0.197657
125 45 -0.201181
126 12 -0.201359
127 8 -0.201887
128 101 -0.203043
129 149 -0.204315
130 112 -0.204922
131 54 -0.205141
132 50 -0.205235
133 150 -0.205884
134 147 -0.208164
135 1 -0.208597
136 53 -0.209838
137 9 -0.210375
138 52 -0.210404
139 108 -0.212018
140 104 -0.214856
141 148 -0.216398
142 102 -0.216993
143 7 -0.221978
144 5 -0.222895
145 109 -0.222942
146 86 -0.225827
147 105 -0.230408
148 106 -0.231272
149 111 -0.236596
150 103 -0.241512

151 rows × 2 columns


In [146]:
# TF-IDF over the cleaned transcripts, then all-pairs cosine similarity
tf = TfidfVectorizer(stop_words=stop)
tfidf_matrix = tf.fit_transform(df['transcribed'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
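With the square similarity matrix in hand, the most similar episodes to any one episode can be read straight off its row. A sketch, assuming `df` keeps its default integer index; position 0 of the sorted row is the episode itself, so it is skipped:

In [ ]:
# the ten transcripts most similar to episode 0
episode = 0
related = cosine_similarities[episode].argsort()[::-1][1:11]
df.loc[related, ['episode', 'series']]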

In [130]:
# hack: refit the vectorizer on the query itself, then score each transcript
# by how much of the query vocabulary it contains
query = 'python economics love'
trans_query = query.lower().split()
tfidf_matrix_test = tf.fit_transform(trans_query)
tfidf_matrix_train = tf.transform(df['transcribed'])
query_similarities = linear_kernel(tfidf_matrix_test, tfidf_matrix_train)
query_similarities = query_similarities.argsort()[0][::-1]
# map rank -> podcast index, then flip it into a DataFrame indexed by podcast
pod_dict = dict(zip(range(0, len(query_similarities)), query_similarities))
pod_dict = pd.DataFrame({'rank': pod_dict.keys()}, index=pod_dict.values())
#related_podcasts_df = pd.DataFrame.join(pod_dict, df, how='inner')
#final_df = related_podcasts_df.sort_values('rank')[1:11][['rank','episode','series']]
#related_podcasts = final_df['episode']
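Refitting `tf` on the query throws away the corpus vocabulary learned in In [146]. An alternative sketch keeps that fitted vectorizer and projects the query into the corpus space instead (assuming `tf` has not been refit in between, and `df` keeps its default integer index):

In [ ]:
# score every transcript against the query in the corpus' own TF-IDF space
query_vec = tf.transform(['python economics love'])
scores = linear_kernel(query_vec, tfidf_matrix)[0]
top_ten = scores.argsort()[::-1][:10]
df.loc[top_ten, ['episode', 'series']]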

In [131]:
pod_dict


Out[131]:
rank
148 0
113 1
150 2
45 3
52 4
51 5
50 6
49 7
48 8
47 9
46 10
44 11
54 12
43 13
42 14
41 15
40 16
39 17
38 18
53 19
55 20
36 21
56 22
71 23
70 24
69 25
68 26
67 27
66 28
65 29
... ...
86 121
85 122
84 123
83 124
82 125
81 126
80 127
79 128
78 129
77 130
76 131
91 132
92 133
93 134
102 135
108 136
107 137
106 138
105 139
104 140
103 141
101 142
94 143
100 144
99 145
98 146
97 147
96 148
95 149
0 150

151 rows × 1 columns


In [ ]: